{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('household_power_consumption.txt', sep=';', \n",
    "                  infer_datetime_format=True, \n",
    "                  low_memory=False, na_values=['nan','?'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Time</th>\n",
       "      <th>Global_active_power</th>\n",
       "      <th>Global_reactive_power</th>\n",
       "      <th>Voltage</th>\n",
       "      <th>Global_intensity</th>\n",
       "      <th>Sub_metering_1</th>\n",
       "      <th>Sub_metering_2</th>\n",
       "      <th>Sub_metering_3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16/12/2006</td>\n",
       "      <td>17:24:00</td>\n",
       "      <td>4.216</td>\n",
       "      <td>0.418</td>\n",
       "      <td>234.84</td>\n",
       "      <td>18.4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16/12/2006</td>\n",
       "      <td>17:25:00</td>\n",
       "      <td>5.360</td>\n",
       "      <td>0.436</td>\n",
       "      <td>233.63</td>\n",
       "      <td>23.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>16.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>16/12/2006</td>\n",
       "      <td>17:26:00</td>\n",
       "      <td>5.374</td>\n",
       "      <td>0.498</td>\n",
       "      <td>233.29</td>\n",
       "      <td>23.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16/12/2006</td>\n",
       "      <td>17:27:00</td>\n",
       "      <td>5.388</td>\n",
       "      <td>0.502</td>\n",
       "      <td>233.74</td>\n",
       "      <td>23.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16/12/2006</td>\n",
       "      <td>17:28:00</td>\n",
       "      <td>3.666</td>\n",
       "      <td>0.528</td>\n",
       "      <td>235.68</td>\n",
       "      <td>15.8</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date      Time  Global_active_power  Global_reactive_power  Voltage  \\\n",
       "0  16/12/2006  17:24:00                4.216                  0.418   234.84   \n",
       "1  16/12/2006  17:25:00                5.360                  0.436   233.63   \n",
       "2  16/12/2006  17:26:00                5.374                  0.498   233.29   \n",
       "3  16/12/2006  17:27:00                5.388                  0.502   233.74   \n",
       "4  16/12/2006  17:28:00                3.666                  0.528   235.68   \n",
       "\n",
       "   Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  \n",
       "0              18.4             0.0             1.0            17.0  \n",
       "1              23.0             0.0             1.0            16.0  \n",
       "2              23.0             0.0             2.0            17.0  \n",
       "3              23.0             0.0             1.0            17.0  \n",
       "4              15.8             0.0             1.0            17.0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Date                      object\n",
       "Time                      object\n",
       "Global_active_power      float64\n",
       "Global_reactive_power    float64\n",
       "Voltage                  float64\n",
       "Global_intensity         float64\n",
       "Sub_metering_1           float64\n",
       "Sub_metering_2           float64\n",
       "Sub_metering_3           float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "for j in range(2,9):        \n",
    "        df.iloc[:,j]=df.iloc[:,j].fillna(df.iloc[:,j].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Date                     0\n",
       "Time                     0\n",
       "Global_active_power      0\n",
       "Global_reactive_power    0\n",
       "Voltage                  0\n",
       "Global_intensity         0\n",
       "Sub_metering_1           0\n",
       "Sub_metering_2           0\n",
       "Sub_metering_3           0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "export_csv = df.to_csv ('C:/Users/anish/Downloads/Data Management and Big Data/Final Project/input_data.csv', index = None, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 4.43 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "inputdf = pd.read_csv('input_data.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
